Emorie D Beck
We’re going to work with a subsample of data from multiple studies. These are simulated versions of data from my dissertation simulated to mirror the underlying structure of each study, including their drawbacks.
load(url("https://github.com/emoriebeck/psc290-data-viz-2022/blob/main/04-week4-associations/04-data/week4-data.RData?raw=true"))
pred_data# A tibble: 10 × 4
Trait Outcome Moderator data
<chr> <chr> <chr> <list>
1 A mortality gender <tibble [5,022 × 25]>
2 A mortality none <tibble [5,022 × 25]>
3 C mortality gender <tibble [5,021 × 25]>
4 C mortality none <tibble [5,021 × 25]>
5 E mortality gender <tibble [5,021 × 25]>
6 E mortality none <tibble [5,021 × 25]>
7 N mortality gender <tibble [5,021 × 25]>
8 N mortality none <tibble [5,021 × 25]>
9 O mortality gender <tibble [5,019 × 25]>
10 O mortality none <tibble [5,019 × 25]>
# A tibble: 5,021 × 25
study o_value p_year SID p_value age gender grsWages parEdu race
<chr> <fct> <dbl> <chr> <dbl> <dbl> <fct> <dbl> <fct> <fct>
1 Study1 0 2005 61215 6.67 -29.9 1 1.02 2 0
2 Study1 0 2005 184965 0 -22.9 0 1.14 2 0
3 Study1 0 2005 488251 10 -3.92 1 0.717 1 0
4 Study1 0 2005 650779 7.22 -25.9 1 0.644 3 0
5 Study1 0 2005 969691 7.22 -0.925 1 0.812 2 0
6 Study1 0 2005 986687 6.11 14.1 0 1.76 2 0
7 Study1 0 2005 1054011 5.56 8.08 0 1.34 1 0
8 Study1 0 2005 1372251 7.78 5.08 1 0.842 1 0
9 Study1 0 2005 1496703 6.11 -23.9 0 1.42 2 0
10 Study1 0 2005 1897887 2.78 38.1 1 0.725 2 0
# … with 5,011 more rows, and 15 more variables: physhlthevnt <fct>,
# SRhealth <dbl>, smokes <fct>, alcohol <fct>, exercise <dbl>, BMI <dbl>,
# parDivorce <fct>, PhysFunc <fct>, religion <fct>, education <fct>,
# married <fct>, numKids <dbl>, parOccPrstg <dbl>, reliability <dbl>,
# predInt <dbl>
p_value) and self-rated helath (SRhealth)# A tibble: 5,021 × 4
study SID p_value SRhealth
<chr> <chr> <dbl> <dbl>
1 Study1 61215 6.67 9.23
2 Study1 184965 0 7.5
3 Study1 488251 10 6.43
4 Study1 650779 7.22 6.92
5 Study1 969691 7.22 5.77
6 Study1 986687 6.11 6.84
7 Study1 1054011 5.56 6
8 Study1 1372251 7.78 3.62
9 Study1 1496703 6.11 6.59
10 Study1 1897887 2.78 5.62
# … with 5,011 more rows
Let’s look at a basic scatterplot of the association
pred_data$data[[4]] %>%
select(study, SID, p_value, SRhealth) %>%
ggplot(aes(x = p_value, y = SRhealth)) +
geom_point(shape = 21, fill = "grey80", color = "black", size = 2) +
geom_smooth(method = "lm", size = 3, se = F) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Self-Rated Health (POMP; 0-10)"
) +
theme_classic()pred_data$data[[4]] %>%
select(study, SID, p_value, SRhealth) %>%
filter(!is.na(SRhealth)) %>%
ggplot(aes(x = p_value, y = SRhealth)) +
geom_point(shape = 21, fill = "grey80", color = "black", size = 2) +
scale_fill_manual(values = c("grey80", "seagreen4")) +
facet_wrap(~study) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Self-Rated Health (POMP; 0-10)"
) +
theme_classic()pred_data$data[[4]] %>%
select(study, SID, p_value, SRhealth) %>%
filter(!is.na(SRhealth)) %>%
ggplot(aes(x = p_value, y = SRhealth)) +
geom_point(shape = 21, fill = "grey80", color = "black", size = 2, alpha = .25) +
geom_smooth(method = "lm", size = 3, se = F) +
scale_fill_manual(values = c("grey80", "seagreen4")) +
facet_wrap(~study) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Self-Rated Health (POMP; 0-10)"
) +
theme_classic()pred_data$data[[4]] %>%
select(study, SID, p_value, SRhealth) %>%
filter(!is.na(SRhealth)) %>%
ggplot(aes(x = p_value, y = SRhealth)) +
geom_point(shape = 21, fill = "grey80", color = "black", size = 2, alpha = .25) +
geom_smooth(method = "lm", size = 1.5, se = T, color = "black") +
scale_fill_manual(values = c("grey80", "seagreen4")) +
facet_wrap(~study) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Self-Rated Health (POMP; 0-10)"
, title = "Conscientiousness -- Self-Rated Health Associations Across Samples"
) +
theme_classic()R packages for this, but where’s the fun in that?r_data <- pred_data$data[[4]] %>%
select(study, p_value, age, gender, SRhealth, smokes, exercise, BMI, education, parEdu, mortality = o_value) %>%
mutate_if(is.factor, ~as.numeric(as.character(.))) %>%
group_by(study) %>%
nest() %>%
ungroup() %>%
mutate(r = map(data, ~cor(., use = "pairwise")))
r_data# A tibble: 6 × 3
study data r
<chr> <list> <list>
1 Study1 <tibble [831 × 10]> <dbl [10 × 10]>
2 Study2 <tibble [1,000 × 10]> <dbl [10 × 10]>
3 Study3 <tibble [1,000 × 10]> <dbl [10 × 10]>
4 Study4 <tibble [574 × 10]> <dbl [10 × 10]>
5 Study5 <tibble [616 × 10]> <dbl [10 × 10]>
6 Study6 <tibble [1,000 × 10]> <dbl [10 × 10]>
cor() produces p_value age gender SRhealth smokes
p_value 1.000000000 -0.005224085 0.053627861 0.15917525 -0.069013463
age -0.005224085 1.000000000 -0.057243245 -0.22438335 -0.078788619
gender 0.053627861 -0.057243245 1.000000000 -0.03182278 0.022275557
SRhealth 0.159175251 -0.224383351 -0.031822781 1.00000000 -0.129241536
smokes -0.069013463 -0.078788619 0.022275557 -0.12924154 1.000000000
exercise 0.048576025 -0.361768736 0.061659017 0.34546038 -0.155018841
BMI -0.019741798 0.036151816 0.012217132 -0.09340105 -0.037713371
education 0.001465775 -0.173399716 -0.001603648 0.11008540 -0.096936630
parEdu 0.019871078 -0.374733606 0.055468171 0.08273023 0.005215303
mortality -0.089637524 0.627069166 -0.092109448 -0.31142292 0.035759332
exercise BMI education parEdu mortality
p_value 0.04857602 -0.01974180 0.001465775 0.019871078 -0.08963752
age -0.36176874 0.03615182 -0.173399716 -0.374733606 0.62706917
gender 0.06165902 0.01221713 -0.001603648 0.055468171 -0.09210945
SRhealth 0.34546038 -0.09340105 0.110085399 0.082730234 -0.31142292
smokes -0.15501884 -0.03771337 -0.096936630 0.005215303 0.03575933
exercise 1.00000000 -0.06217297 0.210204022 0.176766791 -0.32138385
BMI -0.06217297 1.00000000 -0.048914825 -0.075000576 0.01643219
education 0.21020402 -0.04891483 1.000000000 0.232321970 -0.17215791
parEdu 0.17676679 -0.07500058 0.232321970 1.000000000 -0.18796244
mortality -0.32138385 0.01643219 -0.172157913 -0.187962436 1.00000000
r_reshape_fun <- function(r){
coln <- colnames(r)
# remove lower tri and diagonal
r[lower.tri(r, diag = T)] <- NA
r %>% data.frame() %>%
rownames_to_column("V1") %>%
pivot_longer(
cols = -V1
, values_to = "r"
, names_to = "V2"
) %>%
mutate_at(vars(V1, V2), ~factor(., coln))
}
r_data <- r_data %>%
mutate(r = map(r, r_reshape_fun))
r_data$r[[1]]# A tibble: 100 × 3
V1 V2 r
<fct> <fct> <dbl>
1 p_value p_value NA
2 p_value age -0.00522
3 p_value gender 0.0536
4 p_value SRhealth 0.159
5 p_value smokes -0.0690
6 p_value exercise 0.0486
7 p_value BMI -0.0197
8 p_value education 0.00147
9 p_value parEdu 0.0199
10 p_value mortality -0.0896
# … with 90 more rows
This is, technically, a heat map, but I think we can do better!
Let’s add some intuitive colors using scale_fill_gradient2()
Do we need axis labels?
r_data$r[[1]] %>%
ggplot(aes(x = V1, y = V2, fill = r)) +
geom_raster() +
scale_fill_gradient2(limits = c(-1,1)
, breaks = c(-1, -.5, 0, .5, 1)
, low = "blue", high = "red"
, mid = "white", na.value = "white") +
labs(
x = NULL
, y = NULL
, fill = "Zero-Order Correlation"
, title = "Zero-Order Correlations Among Variables in Sample 1"
) +
theme_minimal()Let’s fix the theme elements. So close!
r_data$r[[1]] %>%
ggplot(aes(x = V1, y = V2, fill = r)) +
geom_raster() +
scale_fill_gradient2(limits = c(-1,1)
, breaks = c(-1, -.5, 0, .5, 1)
, low = "blue", high = "red"
, mid = "white", na.value = "white") +
labs(
x = NULL
, y = NULL
, fill = "Zero-Order Correlation"
, title = "Zero-Order Correlations Among Variables"
, subtitle = "Sample 1"
) +
theme_classic() +
theme(
legend.position = "bottom"
, axis.text = element_text(face = "bold")
, axis.text.x = element_text(angle = 45, hjust = 1)
, plot.title = element_text(face = "bold", hjust = .5)
, plot.subtitle = element_text(face = "italic", hjust = .5)
, panel.background = element_rect(color = "black", size = 1)
)Let’s fix the theme elements. So close!
r_data$r[[1]] %>%
ggplot(aes(x = V1, y = V2, fill = r)) +
geom_raster() +
geom_text(aes(label = round(r, 2))) +
scale_fill_gradient2(limits = c(-1,1)
, breaks = c(-1, -.5, 0, .5, 1)
, low = "blue", high = "red"
, mid = "white", na.value = "white") +
labs(
x = NULL
, y = NULL
, fill = "Zero-Order Correlation"
, title = "Zero-Order Correlations Among Variables"
, subtitle = "Sample 1"
) +
theme_classic() +
theme(
legend.position = "bottom"
, axis.text = element_text(face = "bold")
, axis.text.x = element_text(angle = 45, hjust = 1)
, plot.title = element_text(face = "bold", hjust = .5)
, plot.subtitle = element_text(face = "italic", hjust = .5)
, panel.background = element_rect(color = "black", size = 1)
)A correlelogram is basically a heat map that uses size in addition to color.
We’re going to skip the steps we took with a heat map. So close! Just need to get rid of that size legend.
r_data$r[[1]] %>%
ggplot(aes(x = V1, y = V2, fill = r, size = abs(r))) +
geom_point(shape = 21) +
scale_fill_gradient2(limits = c(-1,1)
, breaks = c(-1, -.5, 0, .5, 1)
, low = "blue", high = "red"
, mid = "white", na.value = "white") +
scale_size_continuous(range = c(3,14)) +
labs(
x = NULL
, y = NULL
, fill = "Zero-Order Correlation"
, title = "Zero-Order Correlations Among Variables"
, subtitle = "Sample 1"
) +
theme_classic() +
theme(
legend.position = "bottom"
, axis.text = element_text(face = "bold")
, axis.text.x = element_text(angle = 45, hjust = 1)
, plot.title = element_text(face = "bold", hjust = .5)
, plot.subtitle = element_text(face = "italic", hjust = .5)
, panel.background = element_rect(color = "black", size = 1)
)We’re going to skip the steps we took with a heat map. * So close! Just need to get rid of that size legend. * To do this, we’ll use the guides() function!
r_data$r[[1]] %>%
ggplot(aes(x = V1, y = V2, fill = r, size = abs(r))) +
geom_point(shape = 21) +
scale_fill_gradient2(limits = c(-1,1)
, breaks = c(-1, -.5, 0, .5, 1)
, low = "blue", high = "red"
, mid = "white", na.value = "white") +
scale_size_continuous(range = c(3,14)) +
labs(
x = NULL
, y = NULL
, fill = "Zero-Order Correlation"
, title = "Zero-Order Correlations Among Variables"
, subtitle = "Sample 1"
) +
guides(size = "none") +
theme_classic() +
theme(
legend.position = "bottom"
, axis.text = element_text(face = "bold")
, axis.text.x = element_text(angle = 45, hjust = 1)
, plot.title = element_text(face = "bold", hjust = .5)
, plot.subtitle = element_text(face = "italic", hjust = .5)
, panel.background = element_rect(color = "black", size = 1)
)\[ logit(\frac{\pi}{1-\pi}) = b_0 + b_1*C_{ij} + \epsilon_{ij} \]
ds1 <- pred_data$data[[4]] %>% filter(study == "Study1")
m1 <- glm(o_value ~ p_value, data = ds1, family = binomial(link = "logit"))
summary(m1)
Call:
glm(formula = o_value ~ p_value, family = binomial(link = "logit"),
data = ds1)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.2774 -1.0164 -0.9358 1.3244 1.4866
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.23213 0.25920 0.896 0.3705
p_value -0.09349 0.03632 -2.574 0.0101 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1117.4 on 830 degrees of freedom
Residual deviance: 1110.7 on 829 degrees of freedom
AIC: 1114.7
Number of Fisher Scoring iterations: 4
R are stored in lists or list-like objectsstr()
List of 30
$ coefficients : Named num [1:2] 0.2321 -0.0935
..- attr(*, "names")= chr [1:2] "(Intercept)" "p_value"
$ residuals : Named num [1:831] -1.68 -2.26 -1.5 -1.64 -1.64 ...
..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
$ fitted.values : Named num [1:831] 0.403 0.558 0.331 0.391 0.391 ...
..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
$ effects : Named num [1:831] 5.755 -2.574 -0.729 -0.78 -0.78 ...
..- attr(*, "names")= chr [1:831] "(Intercept)" "p_value" "" "" ...
$ R : num [1:2, 1:2] -14.1 0 -96.5 27.5
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "p_value"
.. ..$ : chr [1:2] "(Intercept)" "p_value"
$ rank : int 2
$ qr :List of 5
..$ qr : num [1:831, 1:2] -14.0555 0.0353 0.0335 0.0347 0.0347 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:831] "1" "2" "3" "4" ...
.. .. ..$ : chr [1:2] "(Intercept)" "p_value"
..$ rank : int 2
..$ qraux: num [1:2] 1.03 1.12
..$ pivot: int [1:2] 1 2
..$ tol : num 1e-11
..- attr(*, "class")= chr "qr"
$ family :List of 12
..$ family : chr "binomial"
..$ link : chr "logit"
..$ linkfun :function (mu)
..$ linkinv :function (eta)
..$ variance :function (mu)
..$ dev.resids:function (y, mu, wt)
..$ aic :function (y, n, mu, wt, dev)
..$ mu.eta :function (eta)
..$ initialize: language { if (NCOL(y) == 1) { ...
..$ validmu :function (mu)
..$ valideta :function (eta)
..$ simulate :function (object, nsim)
..- attr(*, "class")= chr "family"
$ linear.predictors: Named num [1:831] -0.391 0.232 -0.703 -0.443 -0.443 ...
..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
$ deviance : num 1111
$ aic : num 1115
$ null.deviance : num 1117
$ iter : int 4
$ weights : Named num [1:831] 0.241 0.247 0.222 0.238 0.238 ...
..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
$ prior.weights : Named num [1:831] 1 1 1 1 1 1 1 1 1 1 ...
..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
$ df.residual : int 829
$ df.null : int 830
$ y : Named num [1:831] 0 0 0 0 0 0 0 0 0 0 ...
..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
$ converged : logi TRUE
$ boundary : logi FALSE
$ model :'data.frame': 831 obs. of 2 variables:
..$ o_value: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
..$ p_value: num [1:831] 6.67 0 10 7.22 7.22 ...
..- attr(*, "terms")=Classes 'terms', 'formula' language o_value ~ p_value
.. .. ..- attr(*, "variables")= language list(o_value, p_value)
.. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. ..$ : chr [1:2] "o_value" "p_value"
.. .. .. .. ..$ : chr "p_value"
.. .. ..- attr(*, "term.labels")= chr "p_value"
.. .. ..- attr(*, "order")= int 1
.. .. ..- attr(*, "intercept")= int 1
.. .. ..- attr(*, "response")= int 1
.. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. .. ..- attr(*, "predvars")= language list(o_value, p_value)
.. .. ..- attr(*, "dataClasses")= Named chr [1:2] "factor" "numeric"
.. .. .. ..- attr(*, "names")= chr [1:2] "o_value" "p_value"
$ call : language glm(formula = o_value ~ p_value, family = binomial(link = "logit"), data = ds1)
$ formula :Class 'formula' language o_value ~ p_value
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
$ terms :Classes 'terms', 'formula' language o_value ~ p_value
.. ..- attr(*, "variables")= language list(o_value, p_value)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "o_value" "p_value"
.. .. .. ..$ : chr "p_value"
.. ..- attr(*, "term.labels")= chr "p_value"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(o_value, p_value)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "factor" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "o_value" "p_value"
$ data : tibble [831 × 25] (S3: tbl_df/tbl/data.frame)
..$ study : chr [1:831] "Study1" "Study1" "Study1" "Study1" ...
..$ o_value : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
..$ p_year : num [1:831] 2005 2005 2005 2005 2005 ...
..$ SID : chr [1:831] "61215" "184965" "488251" "650779" ...
..$ p_value : num [1:831] 6.67 0 10 7.22 7.22 ...
..$ age : num [1:831] -29.925 -22.925 -3.925 -25.925 -0.925 ...
..$ gender : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 1 2 1 2 ...
..$ grsWages : num [1:831] 1.021 1.139 0.717 0.644 0.812 ...
..$ parEdu : Factor w/ 3 levels "1","2","3": 2 2 1 3 2 2 1 1 2 2 ...
..$ race : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
..$ physhlthevnt: Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...
..$ SRhealth : num [1:831] 9.23 7.5 6.43 6.92 5.77 ...
..$ smokes : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 2 1 2 1 ...
..$ alcohol : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...
..$ exercise : num [1:831] 3.75 6.25 5 10 4.5 ...
..$ BMI : num [1:831] 2.02 1.84 1.67 1.89 3.29 ...
..$ parDivorce : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
..$ PhysFunc : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
..$ religion : Factor w/ 3 levels "0","1","2": 1 1 2 2 2 1 1 2 2 2 ...
..$ education : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
..$ married : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 2 1 1 ...
..$ numKids : num [1:831] 0 0 0.588 0 1.765 ...
..$ parOccPrstg : num [1:831] 3.67 5.41 1.21 4.66 4.99 ...
..$ reliability : num [1:831] 0.511 0.511 0.511 0.511 0.511 ...
..$ predInt : num [1:831] 13 13 13 13 13 13 13 13 13 13 ...
$ offset : NULL
$ control :List of 3
..$ epsilon: num 1e-08
..$ maxit : num 25
..$ trace : logi FALSE
$ method : chr "glm.fit"
$ contrasts : NULL
$ xlevels : Named list()
- attr(*, "class")= chr [1:2] "glm" "lm"
broom
broom package is great for working with models (and the broom.mixed add-on makes it even better)tidy()glance()augment()broom: tidy()
dplyr/tidyr, tidy() is a close contender with purrr::map() functions as my most used functionR provides the summary(), coef(), etc. to extract various components of the modeldata.frames, which are core input to a lot of other R functions across packagestidy() provides a data frame with core model coefficients, inferential tests, etc. that be easily matched and merged across models, etc.broom: tidy()
coef() function, but this still leaves us with the need to extract estimates of precision, like standard errors, confidence intervals, and more.broom: tidy()
broom::tidy()!broom: tidy()
broom::tidy()!broom: tidy()
tidy() data frame of the parmeter estimates for each sampletidy_ci <- function(m) tidy(m, conf.int = T)
nested_m <- pred_data$data[[4]] %>%
group_by(study) %>%
nest() %>%
ungroup() %>%
mutate(
m = map(data, ~glm(o_value ~p_value, data = ., family = binomial(link = "logit")))
, tidy = map(m, tidy_ci)
)
nested_m# A tibble: 6 × 4
study data m tidy
<chr> <list> <list> <list>
1 Study1 <tibble [831 × 24]> <glm> <tibble [2 × 7]>
2 Study2 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]>
3 Study3 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]>
4 Study4 <tibble [574 × 24]> <glm> <tibble [2 × 7]>
5 Study5 <tibble [616 × 24]> <glm> <tibble [2 × 7]>
6 Study6 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]>
broom: tidy()
Now, we’ll drop the data and m columns that we don’t need and unnest() our tidy() data frames
# A tibble: 12 × 8
study term estimate std.error statistic p.value conf.low conf.high
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Study1 (Intercept) 0.232 0.259 0.896 0.370 -0.276 0.741
2 Study1 p_value -0.0935 0.0363 -2.57 0.0101 -0.165 -0.0225
3 Study2 (Intercept) 1.42 0.310 4.60 0.00000425 0.823 2.04
4 Study2 p_value -0.185 0.0392 -4.71 0.00000246 -0.262 -0.108
5 Study3 (Intercept) 1.32 0.328 4.03 0.0000565 0.685 1.97
6 Study3 p_value -0.166 0.0404 -4.11 0.0000390 -0.246 -0.0877
7 Study4 (Intercept) -1.62 0.464 -3.49 0.000482 -2.57 -0.741
8 Study4 p_value -0.0581 0.0898 -0.647 0.518 -0.232 0.121
9 Study5 (Intercept) -1.17 0.496 -2.36 0.0183 -2.16 -0.215
10 Study5 p_value -0.0391 0.0655 -0.597 0.550 -0.167 0.0905
11 Study6 (Intercept) 0.277 0.336 0.826 0.409 -0.380 0.937
12 Study6 p_value -0.0367 0.0437 -0.841 0.400 -0.123 0.0488
broom: tidy()
Now these parameters from multiple models, we may want to plot!
broom: tidy()
Almost, but we have two parameters for each model (Intercept and p_value), so let’s split those in a facet:
nested_m %>%
select(study, tidy) %>%
unnest(tidy) %>%
mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
ggplot(
aes(y = study, x = estimate)
) +
geom_errorbar(
aes(xmin = conf.low, xmax = conf.high)
, position = position_dodge(width = .9)
, width = .1
) +
geom_point() +
facet_grid(~term) +
theme_classic()broom: tidy()
We’ve got some work to do to make this an intuitive figure. Let’s:
broom: tidy()
Add a dashed line at 1 (odd ratio of 1 is a null effect)
nested_m %>%
select(study, tidy) %>%
unnest(tidy) %>%
mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
ggplot(
aes(y = study, x = estimate)
) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
geom_errorbar(
aes(xmin = conf.low, xmax = conf.high)
, position = position_dodge(width = .9)
, width = .1
) +
geom_point() +
facet_grid(~term, scales = "free") +
theme_classic()broom: tidy()
Make the points bigger
nested_m %>%
select(study, tidy) %>%
unnest(tidy) %>%
mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
ggplot(
aes(y = study, x = estimate)
) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
geom_errorbar(
aes(xmin = conf.low, xmax = conf.high)
, position = position_dodge(width = .9)
, width = .1
) +
geom_point(size = 3, shape = 15) +
facet_grid(~term, scales = "free") +
theme_classic()broom: tidy()
Fix the titles on the plot and axis titles
nested_m %>%
select(study, tidy) %>%
unnest(tidy) %>%
mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
ggplot(
aes(y = study, x = estimate)
) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
geom_errorbar(
aes(xmin = conf.low, xmax = conf.high)
, position = position_dodge(width = .9)
, width = .1
) +
geom_point(size = 3, shape = 15) +
labs(
x = "Estimate (CI) in OR"
, y = NULL
, title = "Conscientiousness was associated with mortality 50% of samples"
, subtitle = "Samples with lower mortality risk overall had fewer significant associations"
) +
facet_grid(~term, scales = "free") +
theme_classic()broom: tidy()
Add some color Fiddle with themes to make it prettier
nested_m %>%
select(study, tidy) %>%
unnest(tidy) %>%
mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
ggplot(
aes(y = study, x = estimate, fill = study)
) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
geom_errorbar(
aes(xmin = conf.low, xmax = conf.high)
, position = position_dodge(width = .9)
, width = .1
) +
geom_point(size = 3, shape = 22) +
labs(
x = "Estimate (CI) in OR"
, y = NULL
, title = "Conscientiousness was associated with mortality 50% of samples"
, subtitle = "Samples with lower mortality risk overall had fewer significant associations"
) +
facet_grid(~term, scales = "free") +
theme_classic() +
theme(
legend.position = "none"
, axis.text = element_text(face = "bold", size = rel(1.1))
, axis.title = element_text(face = "bold", size = rel(1.2))
, axis.line = element_blank()
, strip.text = element_text(face = "bold", size = rel(1.1), color = "white")
, strip.background = element_rect(fill = "black")
, plot.title = element_text(face = "bold", size = rel(1.1), hjust = .5)
, plot.subtitle = element_text(face = "italic", size = rel(1.1))
, panel.border = element_rect(color = "black", fill = NA, size = 1)
)broom: tidy()
broom: tidy()
nested_m %>%
select(study, tidy) %>%
unnest(tidy) %>%
mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
ggplot(
aes(y = study, x = estimate, fill = study)
) +
geom_vline(aes(xintercept = 1), linetype = "dashed") +
geom_errorbar(
aes(xmin = conf.low, xmax = conf.high)
, position = position_dodge(width = .9)
, width = .1
) +
geom_point(size = 3, shape = 22) +
labs(
x = "Estimate (CI) in OR"
, y = NULL
, title = "Conscientiousness was associated with mortality 50% of samples"
, subtitle = "Samples with lower mortality risk overall had fewer significant associations"
) +
facet_grid(~term, scales = "free") +
theme_classic() +
theme(
legend.position = "none"
, axis.text = element_text(face = "bold", size = rel(1.1))
, axis.title = element_text(face = "bold", size = rel(1.2))
, axis.line = element_blank()
, strip.text = element_text(face = "bold", size = rel(1.1), color = "white")
, strip.background = element_rect(fill = "black")
, plot.title = element_text(face = "bold", size = rel(1.1), hjust = .5)
, plot.subtitle = element_text(face = "italic", size = rel(1.1))
, panel.border = element_rect(color = "black", fill = NA, size = 1)
)broom: glance()
glance() function brings some of these important ones into a single objectbroom: glance()
# A tibble: 1 × 12
r.squared adj.r.squa…¹ sigma stati…² p.value df logLik AIC BIC devia…³
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.0503 0.0492 1.60 44.0 6.06e-11 1 -1571. 3148. 3162. 2133.
# … with 2 more variables: df.residual <int>, nobs <int>, and abbreviated
# variable names ¹adj.r.squared, ²statistic, ³deviance
broom: glance()
As before, we can do this with lots of models to compare across samples:
# A tibble: 6 × 5
study data m tidy glance
<chr> <list> <list> <list> <list>
1 Study1 <tibble [831 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
2 Study2 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
3 Study3 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
4 Study4 <tibble [574 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
5 Study5 <tibble [616 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
6 Study6 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
broom: glance()
As before, we can do this with lots of models to compare across samples:
# A tibble: 6 × 9
study null.deviance df.null logLik AIC BIC deviance df.residual nobs
<chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 Study1 1117. 830 -555. 1115. 1124. 1111. 829 831
2 Study2 1386. 999 -682. 1367. 1377. 1363. 998 1000
3 Study3 1386. 999 -684. 1373. 1383. 1369. 998 1000
4 Study4 441. 573 -220. 445. 454. 441. 572 574
5 Study5 596. 615 -298. 600. 608. 596. 614 616
6 Study6 1386. 999 -693. 1390. 1399. 1386. 998 1000
broom: glance()
Realistically, this is the kind of info we table, but we can also merge it with info from tidy:
nested_m %>%
select(-data, -m) %>%
unnest(tidy) %>%
unnest(glance) %>%
mutate_if(is.numeric, ~round(., 2))# A tibble: 12 × 16
study term estim…¹ std.e…² stati…³ p.value conf.…⁴ conf.…⁵ null.…⁶ df.null
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Study1 (Inte… 0.23 0.26 0.9 0.37 -0.28 0.74 1117. 830
2 Study1 p_val… -0.09 0.04 -2.57 0.01 -0.17 -0.02 1117. 830
3 Study2 (Inte… 1.42 0.31 4.6 0 0.82 2.04 1386. 999
4 Study2 p_val… -0.18 0.04 -4.71 0 -0.26 -0.11 1386. 999
5 Study3 (Inte… 1.32 0.33 4.03 0 0.68 1.97 1386. 999
6 Study3 p_val… -0.17 0.04 -4.11 0 -0.25 -0.09 1386. 999
7 Study4 (Inte… -1.62 0.46 -3.49 0 -2.57 -0.74 441. 573
8 Study4 p_val… -0.06 0.09 -0.65 0.52 -0.23 0.12 441. 573
9 Study5 (Inte… -1.17 0.5 -2.36 0.02 -2.16 -0.21 596 615
10 Study5 p_val… -0.04 0.07 -0.6 0.55 -0.17 0.09 596 615
11 Study6 (Inte… 0.28 0.34 0.83 0.41 -0.38 0.94 1386. 999
12 Study6 p_val… -0.04 0.04 -0.84 0.4 -0.12 0.05 1386. 999
# … with 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
# df.residual <dbl>, nobs <dbl>, and abbreviated variable names ¹estimate,
# ²std.error, ³statistic, ⁴conf.low, ⁵conf.high, ⁶null.deviance
broom: augment()
Let’s keep working with our nested data frame. Remember, it looks like this:
# A tibble: 6 × 5
study data m tidy glance
<chr> <list> <list> <list> <list>
1 Study1 <tibble [831 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
2 Study2 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
3 Study3 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
4 Study4 <tibble [574 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
5 Study5 <tibble [616 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
6 Study6 <tibble [1,000 × 24]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
broom: augment()
augment() let’s us add (augment) the raw data we feed the model based on the fitted model# A tibble: 6 × 5
study data m tidy glance
<chr> <list> <list> <list> <list>
1 Study1 <tibble [831 × 31]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
2 Study2 <tibble [1,000 × 31]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
3 Study3 <tibble [1,000 × 31]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
4 Study4 <tibble [574 × 31]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
5 Study5 <tibble [616 × 31]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
6 Study6 <tibble [1,000 × 31]> <glm> <tibble [2 × 7]> <tibble [1 × 8]>
broom: augment()
glm:
.fitted: fitted / predicted value.se.fit: standard error.resid: observed - fitted.std.resd: standardized residuals.sigma: estimated residual SD when this obs is dropped from modelcooksd: Cooks distance (is this an outlier?)# A tibble: 831 × 10
o_value SID p_value .fitted .se.fit .resid .std.…¹ .hat .sigma .cooksd
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 61215 6.67 -0.391 0.0715 -1.02 -1.02 0.00123 1.16 4.17e-4
2 0 184965 0 0.232 0.259 -1.28 -1.29 0.0166 1.16 1.08e-2
3 0 488251 10 -0.703 0.134 -0.897 -0.899 0.00400 1.16 9.98e-4
4 0 650779 7.22 -0.443 0.0723 -0.996 -0.997 0.00125 1.16 4.01e-4
5 0 969691 7.22 -0.443 0.0723 -0.996 -0.997 0.00125 1.16 4.01e-4
6 0 986687 6.11 -0.339 0.0762 -1.04 -1.04 0.00141 1.16 5.04e-4
7 0 1054011 5.56 -0.287 0.0855 -1.06 -1.06 0.00179 1.16 6.74e-4
8 0 1372251 7.78 -0.495 0.0785 -0.976 -0.976 0.00145 1.16 4.44e-4
9 0 1496703 6.11 -0.339 0.0762 -1.04 -1.04 0.00141 1.16 5.04e-4
10 0 1897887 2.78 -0.0276 0.165 -1.17 -1.17 0.00677 1.16 3.34e-3
# … with 821 more rows, and abbreviated variable name ¹.std.resid
broom: augment()
For the most part, many of the checks with glm’s and lm’s are the same. But it’s a bit easier to wrap your head around lm(), so let’s switch to that:
nested_lm <- pred_data$data[[4]] %>%
select(study, SID, p_value, age, SRhealth) %>%
drop_na() %>%
group_by(study) %>%
nest() %>%
ungroup() %>%
mutate(m = map(data, ~lm(SRhealth ~ p_value + age, data = .))
, tidy = map(m, tidy_ci)
, glance = map(m, glance)
, data = map2(m, data, augment, se_fit = T, interval = "confidence"))
nested_lm# A tibble: 6 × 5
study data m tidy glance
<chr> <list> <list> <list> <list>
1 Study1 <tibble [831 × 13]> <lm> <tibble [3 × 7]> <tibble [1 × 12]>
2 Study2 <tibble [996 × 13]> <lm> <tibble [3 × 7]> <tibble [1 × 12]>
3 Study3 <tibble [1,000 × 13]> <lm> <tibble [3 × 7]> <tibble [1 × 12]>
4 Study4 <tibble [574 × 13]> <lm> <tibble [3 × 7]> <tibble [1 × 12]>
5 Study5 <tibble [616 × 13]> <lm> <tibble [3 × 7]> <tibble [1 × 12]>
6 Study6 <tibble [1,000 × 13]> <lm> <tibble [3 × 7]> <tibble [1 × 12]>
broom: augment()
lm:
.fitted: fitted / predicted value.se.fit: standard error.lower: lower bound of the confidence/prediction interval.upper: upper bound of the confidence/prediction interval.resid: observed - fitted.std.resd: standardized residuals.sigma: estimated residual SD when this obs is dropped from modelcooksd: Cooks distance (is this an outlier?)broom: augment()
broom: augment()
broom: augment()
Another is raw v. fitted
predict() or fitted() functions# A tibble: 100 × 5
p_value age fit lwr upr
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0 9.42 5.65 2.52 8.79
2 0.101 9.42 5.67 2.53 8.80
3 0.202 9.42 5.68 2.55 8.81
4 0.303 9.42 5.69 2.56 8.82
5 0.404 9.42 5.71 2.57 8.84
6 0.505 9.42 5.72 2.59 8.85
7 0.606 9.42 5.73 2.60 8.86
8 0.707 9.42 5.75 2.62 8.87
9 0.808 9.42 5.76 2.63 8.89
10 0.909 9.42 5.77 2.64 8.90
# … with 90 more rows
crossing(
p_value = seq(0, 10, .1)
, age = mean(d1$age)
) %>%
bind_cols(
.
, predict(m1, newdata = ., interval = "prediction")
) %>%
ggplot(aes(x = p_value, y = fit)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) +
geom_line(color = "seagreen4", size = 2) +
theme_classic()Better scales
crossing(
p_value = seq(0, 10, .1)
, age = mean(d1$age)
) %>%
bind_cols(
.
, predict(m1, newdata = ., interval = "prediction")
) %>%
ggplot(aes(x = p_value, y = fit)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) +
geom_line(color = "seagreen4", size = 2) +
scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
theme_classic()Raw Data
crossing(
p_value = seq(0, 10, .1)
, age = mean(d1$age)
) %>%
bind_cols(., predict(m1, newdata = ., interval = "prediction")) %>%
ggplot(aes(x = p_value, y = fit)) +
geom_point(
data = d1
, aes(x = p_value, y = SRhealth)
, alpha = .4
, color = "seagreen4"
) +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) +
geom_line(color = "seagreen4", size = 2) +
scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
theme_classic()The usual aesthetics
crossing(
p_value = seq(0, 10, .1)
, age = mean(d1$age)
) %>%
bind_cols(., predict(m1, newdata = ., interval = "prediction")) %>%
ggplot(aes(x = p_value, y = fit)) +
geom_point(data = d1, aes(x = p_value, y = SRhealth)
, alpha = .4, color = "seagreen4") +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) +
geom_line(color = "seagreen4", size = 2) +
scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Predicted Self-Rated Health (POMP; 0-10)"
, title = "Conscientiousness and Self-Rated Health\nWere Weakly Associated"
) +
theme_classic() +
theme(
axis.text = element_text(face = "bold", size = rel(1.1))
, axis.title = element_text(face = "bold", size = rel(1.1))
, plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
)Now let’s do it for all of the samples
# A tibble: 6 × 6
study data m tidy glance pred
<chr> <list> <list> <list> <list> <list>
1 Study1 <tibble [831 × 13]> <lm> <tibble [3 × 7]> <tibble> <tibble>
2 Study2 <tibble [996 × 13]> <lm> <tibble [3 × 7]> <tibble> <tibble>
3 Study3 <tibble [1,000 × 13]> <lm> <tibble [3 × 7]> <tibble> <tibble>
4 Study4 <tibble [574 × 13]> <lm> <tibble [3 × 7]> <tibble> <tibble>
5 Study5 <tibble [616 × 13]> <lm> <tibble [3 × 7]> <tibble> <tibble>
6 Study6 <tibble [1,000 × 13]> <lm> <tibble [3 × 7]> <tibble> <tibble>
Now let’s do it for all of the samples
# A tibble: 600 × 6
study p_value age fit lwr upr
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Study1 0 9.42 5.65 2.52 8.79
2 Study1 0.101 9.42 5.67 2.53 8.80
3 Study1 0.202 9.42 5.68 2.55 8.81
4 Study1 0.303 9.42 5.69 2.56 8.82
5 Study1 0.404 9.42 5.71 2.57 8.84
6 Study1 0.505 9.42 5.72 2.59 8.85
7 Study1 0.606 9.42 5.73 2.60 8.86
8 Study1 0.707 9.42 5.75 2.62 8.87
9 Study1 0.808 9.42 5.76 2.63 8.89
10 Study1 0.909 9.42 5.77 2.64 8.90
# … with 590 more rows
Very close, but our intervals are cutoff
nested_lm %>%
select(study, pred) %>%
unnest(pred) %>%
ggplot(aes(x = p_value, y = fit)) +
geom_point(data = d1, aes(x = p_value, y = SRhealth)
, alpha = .2, color = "seagreen4") +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) +
geom_line(color = "seagreen4", size = 2) +
scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Predicted Self-Rated Health (POMP; 0-10)"
, title = "Conscientiousness and Self-Rated Health\nWere Weakly Associated In Most Samples"
) +
facet_wrap(~study, ncol = 2) +
theme_classic() +
theme(
axis.text = element_text(face = "bold", size = rel(1.1))
, axis.title = element_text(face = "bold", size = rel(1.1))
, plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
)nested_lm %>%
select(study, pred) %>%
unnest(pred) %>%
mutate(upr = ifelse(upr > 10, 10, upr)
, lwr = ifelse(lwr < 0, 0, lwr)) %>%
ggplot(aes(x = p_value, y = fit)) +
geom_point(data = d1, aes(x = p_value, y = SRhealth)
, alpha = .2, color = "seagreen4") +
geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) +
geom_line(color = "seagreen4", size = 2) +
scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) +
labs(
x = "Conscientiousness (POMP; 0-10)"
, y = "Predicted Self-Rated Health (POMP; 0-10)"
, title = "Conscientiousness and Self-Rated Health\nWere Weakly Associated In Most Samples"
) +
facet_wrap(~study, ncol = 2) +
theme_classic() +
theme(
axis.text = element_text(face = "bold", size = rel(1.1))
, axis.title = element_text(face = "bold", size = rel(1.1))
, plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
, strip.background = element_rect(fill = "darkseagreen4")
, strip.text = element_text(face = "bold", color = "white")
)PSC 290 - Data Visualization